Library necessary packages

library(NMF)
library(rsvd)
library(Rtsne)
library(ggplot2)
library(cowplot)
library(sva)
library(igraph)
library(cccd)
library(KernSmooth)
library(beeswarm)
library(stringr)
library(reshape2)
library(formatR)
library(destiny)
source("Fxns.R")

Load data

atlas_umis <- load_data(data_name = "./Extend_data/GSE92332_atlas_UMIcounts.txt.gz")
## [1] "File exists!"
## 2017-12-25 09:10:40 INFO: Data dimensions: 15971x7216
atlas_umis <- atlas_umis[which(unlist(apply(atlas_umis, 1, sum)) > 0), ]
atlas_tpm = data.frame(log2(1 + tpm(atlas_umis)))
## 2017-12-25 09:10:49 INFO: Running TPM normalisation

Select variables

v = get.variable.genes(atlas_umis, min.cv2 = 100)
## 2017-12-25 09:11:27 INFO: Fitting only the 9723 genes with mean expression > 0.0330515521064302

## 2017-12-25 09:11:28 INFO: Found 2892 variable genes (p<0.05)
var.genes = as.character(rownames(v)[v$p.adj < 0.05])  # select genes

Based on the previous analysis,we had knowned that the data exactly had batch effect.So we remove the batch effect before follow analysis

get_field = function(string, field = 1, delim = "_", fixed = T) return(strsplit(string, 
    delim, fixed = fixed)[[1]][field])
batch.labels = factor(unlist(lapply(colnames(atlas_umis), get_field, 1, "_")))
atlas_tpm_norm = batch.normalise.comBat(counts = as.matrix(atlas_tpm), batch.groups = batch.labels)
## Standardizing Data across genes
sample.names <- colnames(atlas_tpm_norm)
cell.types <- unlist(lapply(sample.names, function(x) return(str_split(x, "_")[[1]][3])))

Diffusion Map analysis

diffusion_matrix <- cbind(data.frame(Cell = sample.names), t(atlas_tpm_norm[var.genes, 
    ]))
rownames(diffusion_matrix) <- 1:dim(diffusion_matrix)[1]


cells.1 <- c("Stem", "Enterocyte.Progenitor", "Enterocyte.Progenitor.Early", 
    "Enterocyte.Progenitor.Late", "Enterocyte.Immature.Distal", "Enterocyte.Immature.Proximal")
# ct.1 <- as.ExpressionSet(diffusion_matrix[cell.types%in%cells.1,])
# dif.1<-DiffusionMap(ct.1,verbose = T,vars = NULL) # 奇怪,这次样本没有减少
# ???

# save(dif,file='Atlas_UMIs_DiffusionMap.RData')
load("Atlas_UMIs_DiffusionMap_1.RData")
DC <- as.data.frame(eigenvectors(dif))

Figure a

Diffusion components 1 and 3 (DC-1 and DC-3) are associated with the transition from stem cells to progenitors

cell.types.2 <- unlist(lapply(cell.types[cell.types %in% cells.1], function(x) {
    if (str_detect(x, "\\.")) {
        return(str_split(x, "\\.")[[1]][2])
    } else {
        return(str_split(x, "\\.")[[1]][1])
    }
}))


ggplot(DC, aes(x = DC1, y = DC3)) + geom_point(aes(color = cell.types[cell.types %in% 
    cells.1])) + scale_color_manual(values = brewer16) + scale_fill_discrete() + 
    theme(legend.title = element_blank()) + ggtitle("Enterocyte maturation")

# annotate(c('Immature','Progenitor','Stem'),x=c(-0.050,0.000,0.025),y=c(-0.025,0.050,-0.025))

Figure b

cells.2 <- c("Enterocyte.Immature.Distal", "Enterocyte.Immature.Proximal", "Enterocyte.Progenitor.Early", 
    "Enterocyte.Progenitor.Late", "Stem", "TA", "TA.G1", "TA.G2")


# ct.2<- as.ExpressionSet(diffusion_matrix[cell.types%in%cells.2,])
# dif.2<-DiffusionMap(ct.2,verbose = T,vars = NULL) # not reduce samples #
# had saved:Atlas_UMIs_DiffusionMap_2.RData
load("Atlas_UMIs_DiffusionMap_2.RData")
DC <- as.data.frame(eigenvectors(dif))

ggplot(DC, aes(x = DC1, y = DC2)) + geom_point(aes(color = cell.types[cell.types %in% 
    cells.2])) + scale_color_manual(values = brewer16) + scale_fill_discrete() + 
    theme(legend.title = element_blank()) + ggtitle("Enterocyte maturation")

Figure c,d

Cells are coloured by the expression (log2(TPM + 1)) of known and newly identified transcription factors associated with stages of differentiation (c), or with proximal or distal enterocyte differentiation (d)

Create a Function for Mean Log2TPM plot

gene.names <- rownames(atlas_tpm_norm)
plot_tpm <- function(gene, all.genes, cells, celltype, tpm.data, title = NULL, 
    DC.data = DC, DC.F = c(1, 2)) {
    # gene: the gene to caculate mean TPM expression all.genes: all genes of
    # data cells: the cells of sample to select celltype : all the sample cells
    # type tpm.data: the TPM data to use DC.data: data from DiffusionMap
    # function DC.F: the component to plot of DC.data
    # stopifnot(gene%in%rownames(tpm.data))
    gene.tpm <- tpm.data[all.genes %in% gene, celltype %in% cells]
    Logtpm <- as.numeric((apply(gene.tpm, 2, mean)))
    xlabel <- paste("DC", DC.F[1], sep = "-")
    ylabel <- paste("DC", DC.F[2], sep = "-")
    print(ggplot(DC.data, aes(x = DC[, DC.F[1]], y = DC[, DC.F[2]])) + geom_point(aes(color = Logtpm)) + 
        theme(legend.title = element_text(size = 8, color = "blue", face = "bold"), 
            legend.position = "right") + ggtitle(title) + theme_bw() + labs(x = xlabel, 
        y = ylabel) + scale_color_gradient2(low = "green", mid = "blue", high = "red", 
        name = "Log2\nTPM+1"))
    
    return(Logtpm)
}

Figure c

TFs along stem-maturation aixs
load("Atlas_UMIs_DiffusionMap_1.RData")
DC <- as.data.frame(eigenvectors(dif))
Sox4 <- plot_tpm(gene = "Sox4", all.genes = gene.names, cells = cells.1, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Sox4(stem/TA)", DC.F = c(1, 3))

Foxm1 <- plot_tpm(gene = "Foxm1", all.genes = gene.names, cells = cells.1, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Foxm1(progeniters)", DC.F = c(1, 3))

Mxd3 <- plot_tpm(gene = "Mxd3", all.genes = gene.names, cells = cells.1, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Mxd3(progeniters)", DC.F = c(1, 3))

Batf2 <- plot_tpm(gene = "Batf2", all.genes = gene.names, cells = cells.1, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Batf2(Immature Enterocyte)", DC.F = c(1, 
        3))

Figure d

Proximal vs distal TFs
load("Atlas_UMIs_DiffusionMap_2.RData")
DC <- as.data.frame(eigenvectors(dif))

Creb3l3 <- plot_tpm(gene = "Creb3l3", all.genes = gene.names, cells = cells.2, 
    celltype = cell.types, tpm.data = atlas_tpm_norm, title = "Creb3l3(proximal)")

Gata4 <- plot_tpm(gene = "Gata4", all.genes = gene.names, cells = cells.2, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Gata4(proximal)")

Nr1i3 <- plot_tpm(gene = "Nr1i3", all.genes = gene.names, cells = cells.2, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Nr1i3(proximal)")

Osr2 <- plot_tpm(gene = "Osr2", all.genes = gene.names, cells = cells.2, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Osr2(distal)")

Jund <- plot_tpm(gene = "Jund", all.genes = gene.names, cells = cells.2, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Jund(distal)")

Nr1h4 <- plot_tpm(gene = "Nr1h4", all.genes = gene.names, cells = cells.2, celltype = cell.types, 
    tpm.data = atlas_tpm_norm, title = "Nr1h4(distal)")

### Figure e Transcription factors that are differentially expressed between proximal and distal cell fate

cells.3 <- c("Enterocyte.Mature.Proximal", "Enterocyte.Mature.Distal")
genes.e <- unique(as.character(read.table("./Extend_data/Atlas.Matura.Proximal.Distal.txt")$V1))

Matura.Proximal.Distal.tpm <- Heatmap_fun(genes = genes.e, tpm.data = atlas_tpm_norm, 
    condition = cells.3, all.condition = cell.types)
## There ara 2 conditions
## Whether creat data accurate 0
aheatmap(x = Matura.Proximal.Distal.tpm[[2]], Colv = NA, Rowv = NA, annCol = Matura.Proximal.Distal.tpm[[1]])

Figure f

Regional_UMIs <- load_data("./Extend_data/GSE92332_Regional_UMIcounts.txt.gz")
## [1] "File exists!"
## 2017-12-25 09:16:01 INFO: Data dimensions: 27998x11665
Regional_UMIs <- Regional_UMIs[which(unlist(apply(Regional_UMIs, 1, sum)) > 
    0), ]
Regional_tpm <- data.frame(log2(1 + tpm(Regional_UMIs)))
## 2017-12-25 09:16:28 INFO: Running TPM normalisation
# region.var.genes<-get.variable.genes_cvdiff(Regional_tpm)

regional.v = get.variable.genes(Regional_UMIs, min.cv2 = 100)
## 2017-12-25 09:17:35 INFO: Fitting only the 10718 genes with mean expression > 0.0100128589798542

## 2017-12-25 09:17:38 INFO: Found 1926 variable genes (p<0.05)
regional.var.genes = as.character(rownames(regional.v)[regional.v$p.adj < 0.05])
regional.genes <- rownames(Regional_tpm)
region_groups <- unlist(lapply(colnames(Regional_tpm), function(x) return(str_split(x, 
    "_")[[1]][2])))

cell.groups <- unlist(lapply(colnames(Regional_tpm), function(x) return(str_split(x, 
    "_")[[1]][4])))
region.sample.names <- colnames(Regional_tpm)

# only use Region Stem Cells
# region.diffusion.matrix<-cbind(data.frame(Cell=region.sample.names[cell.groups%in%'Stem']),t(Regional_tpm[regional.var.genes,cell.groups%in%'Stem']))
# region.ct<-as.ExpressionSet(region.diffusion.matrix)
# region.dif<-DiffusionMap(region.ct) save(region.dif,file =
# 'Regional_UMIs_DiffusionMap_3.RData')
load("Regional_UMIs_DiffusionMap_3.RData")
DC <- as.data.frame(eigenvectors(region.dif))
Lgr5 <- plot_tpm(gene = "Lgr5", all.genes = regional.genes, cells = c("Stem"), 
    celltype = cell.groups, tpm.data = Regional_tpm, DC.data = DC, DC.F = c(1, 
        2), title = "Lgr5(Stem)")

Gkn3 <- plot_tpm(gene = "Gkn3", all.genes = regional.genes, cells = c("Stem"), 
    celltype = cell.groups, tpm.data = Regional_tpm, DC.data = DC, DC.F = c(1, 
        2), title = "Gkn3(Stem)")

Bex1 <- plot_tpm(gene = "Bex1", all.genes = regional.genes, cells = c("Stem"), 
    celltype = cell.groups, tpm.data = Regional_tpm, DC.data = DC, DC.F = c(1, 
        2), title = "Bex1(Stem)")